In [1]:
%matplotlib inline
In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from plotnine import *
In [3]:
# Shell out to list the CSV files in ../data (long format, human-readable sizes).
!ls -lah ../data/*csv
In [4]:
# Parameter grid: one CSV is expected per (offset, window size) combination,
# named according to output_tpl.
offsets = [150, 200, 300]
winsizes = [50, 80, 100, 200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'

# Read every table and tag its rows with the parameters encoded in the file
# name, so the combined frame can later be filtered or faceted on them.
output = []
for offset in offsets:
    for winsize in winsizes:
        # pd.DataFrame.from_csv was removed in pandas 1.0. read_csv with
        # index_col=0 and parse_dates=True reproduces from_csv's defaults.
        df = pd.read_csv(output_tpl.format(offset, winsize),
                         index_col=0, parse_dates=True)
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)

# Single long table holding all parameter combinations (row index is not reset).
dfa = pd.concat(output)
In [5]:
# Store end_x - start_x as the UTR_length column, then display the table.
dfa['UTR_length'] = dfa['end_x'].sub(dfa['start_x'])
dfa
Out[5]:
In [6]:
# Keep only the offset=200 / win=80 run, restricted to rows with
# UTR_length > 80 and ratio_ATCACG > 2.
keep = (
    (dfa['UTR_length'] > 80)
    & (dfa['ratio_ATCACG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80)
)
d = dfa.loc[keep, ['UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']].copy()

# Add log10-transformed copies of the two ratios and of the length.
for target, source in [('log-bcm', 'ratio_ATCACG'),
                       ('log+bcm', 'ratio_CGATGT'),
                       ('loglen', 'UTR_length')]:
    d[target] = np.log10(d[source])
In [7]:
# Scatter of 'log-bcm' against 'loglen' with a LOWESS trend line (span = 1/7).
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/7.)
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [11]:
# Scatter of 'log-bcm' against 'loglen' with a moving-average trend line.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1)
    # plotnine's 'mavg' smoother reads its window size from method_args and
    # fails without one. 20 is the window used by the other moving-average
    # plots of this ratio in the notebook -- TODO confirm the intended value.
    + geom_smooth(method='mavg', method_args={'window': 20})
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [12]:
# Scatter of 'log+bcm' against 'loglen' with a LOWESS trend line (span = 1/5);
# the y axis is limited to [-1, 2.5].
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/5.)
    + scale_y_continuous(limits=(-1, 2.5))
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [12]:
# Scatter of 'log+bcm' against 'loglen' with a 25-point moving-average trend
# line; the y axis is limited to [-1, 2.5].
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    # method='ma' with a bare window= keyword is the old yhat-ggplot spelling.
    # plotnine names the smoother 'mavg' and takes the window via method_args.
    + geom_smooth(method='mavg', method_args={'window': 25})
    + scale_y_continuous(limits=(-1, 2.5))
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [13]:
# Scatter of the untransformed ratio_ATCACG against 'loglen' with a LOWESS
# trend line (span = 1/17).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/17.)
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [14]:
# Scatter of the untransformed ratio_ATCACG against 'loglen' with a 20-point
# moving-average trend line.
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    # method='ma' with a bare window= keyword is the old yhat-ggplot spelling.
    # plotnine names the smoother 'mavg' and takes the window via method_args.
    + geom_smooth(method='mavg', method_args={'window': 20})
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [15]:
# Same row filter as the single-run subset (UTR_length > 80, ratio_ATCACG > 2)
# but across every offset/win combination, keeping the identifying columns.
keep = (dfa['UTR_length'] > 80) & (dfa['ratio_ATCACG'] > 2)
wanted = ['TSS', 'gene', 'UTR_length',
          'ratio_ATCACG', 'ratio_CGATGT', 'offset', 'win']
d = dfa.loc[keep, wanted].copy()

# Add log10-transformed copies of the two ratios and of the length.
for target, source in [('log-bcm', 'ratio_ATCACG'),
                       ('log+bcm', 'ratio_CGATGT'),
                       ('loglen', 'UTR_length')]:
    d[target] = np.log10(d[source])
In [16]:
# 'log-bcm' against 'loglen' with a LOWESS trend line, one panel per 'win' value.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_wrap('win')
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [17]:
# 'log-bcm' against 'loglen' with a LOWESS trend line, one panel per 'offset' value.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_wrap('offset')
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [18]:
# 'log-bcm' against 'loglen' with a LOWESS trend line, laid out as a grid with
# 'offset' on the rows and 'win' on the columns.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_grid('offset ~ win')
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [19]:
# 'log-bcm' against 'loglen' with a 20-point moving-average trend line, laid
# out as a grid with 'offset' on the rows and 'win' on the columns.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    # method='ma' with a bare window= keyword is the old yhat-ggplot spelling.
    # plotnine names the smoother 'mavg' and takes the window via method_args.
    + geom_smooth(method='mavg', method_args={'window': 20})
    + facet_grid('offset ~ win')
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [20]:
# Single run (offset=200, win=80), restricted to 80 < UTR_length < 600 and
# ratio_ATCACG > 2.
length_ok = (dfa['UTR_length'] > 80) & (dfa['UTR_length'] < 600)
run_ok = (dfa['offset'] == 200) & (dfa['win'] == 80)
selected = length_ok & (dfa['ratio_ATCACG'] > 2) & run_ok
d = dfa.loc[selected,
            ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']].copy()

# Base-2 logs here (the earlier subsets use log10).
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

# Stack the two log-ratio columns into long form: a shared 'logratio' column
# plus a 'bcm' label ('-' or '+') recording which column each row came from.
shared = ['UTR_length', 'loglen']
d1 = (d[shared + ['log-bcm']]
      .rename(columns={'log-bcm': 'logratio'})
      .assign(bcm='-'))
d2 = (d[shared + ['log+bcm']]
      .rename(columns={'log+bcm': 'logratio'})
      .assign(bcm='+'))
_d = pd.concat([d1, d2])
In [21]:
# 'logratio' against UTR_length, coloured by the 'bcm' label, with a LOWESS
# trend line per colour and enlarged axis text.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [22]:
# All offset/win combinations, restricted to 80 < UTR_length < 600 and
# ratio_ATCACG > 2.
length_ok = (dfa['UTR_length'] > 80) & (dfa['UTR_length'] < 600)
selected = length_ok & (dfa['ratio_ATCACG'] > 2)
d = dfa.loc[selected,
            ['TSS', 'win', 'offset', 'gene', 'UTR_length',
             'ratio_ATCACG', 'ratio_CGATGT']].copy()

# Base-2 logs of the two ratios and of the length.
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])

# Stack the two log-ratio columns into long form: a shared 'logratio' column
# plus a 'bcm' label ('-' or '+'), keeping 'win' and 'offset' for faceting.
shared = ['UTR_length', 'win', 'offset', 'loglen']
d1 = (d[shared + ['log-bcm']]
      .rename(columns={'log-bcm': 'logratio'})
      .assign(bcm='-'))
d2 = (d[shared + ['log+bcm']]
      .rename(columns={'log+bcm': 'logratio'})
      .assign(bcm='+'))
_d = pd.concat([d1, d2])
In [23]:
# 'logratio' against UTR_length, coloured by the 'bcm' label, with a LOWESS
# trend line per colour; one panel per (offset, win) combination.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
    + facet_grid('offset ~ win')
)
# End the cell on the plot object so the notebook renders it. print(p) relies on
# ggplot's string conversion drawing the figure, which recent plotnine releases
# no longer do.
p
In [ ]: